##################################
# Load & format data & libraries #
##################################

# Read the formatted CEACAM1 table from the current working directory.
CEACAM1_formated_data <- read.csv("CEACAM1_formated_data.csv")

# Only open the spreadsheet viewer when a person is at the console; calling
# View() from a non-interactive run (e.g. Rscript on a headless machine) can
# fail and stop the whole script before any analysis happens.
if (interactive()) {
  View(CEACAM1_formated_data)
}

# Work on a copy so the data as read from disk stays available for inspection.
working_df <- CEACAM1_formated_data

# Cells equal to "TRUE" are rewritten as "T".
# NOTE(review): presumably this undoes read.csv() parsing a column of "T"
# nucleotide calls as logical TRUE - confirm against the CSV.
working_df[working_df == "TRUE"] <- "T"

# Empty strings are treated as missing values.
working_df[working_df == ""] <- NA

library(ggplot2)

# Formats the population frequency/count columns into usable columns.
#
# Each source cell is split on a single space. The first token is kept as the
# frequency (still text at this point; it is converted to numeric further
# down the script) and the second token, with any parentheses stripped,
# becomes the numeric count.
# NOTE(review): cells are assumed to look like "<frequency> (<count>)" -
# confirm against the CSV.

# Name of the frequency column to create -> source column in working_df.
# The matching count column is named "<name>_count".
pop_source_cols <- c(
  AFR  = "AFR",
  AMR  = "AMR..AMR.count.",
  EAS  = "EAS..EAS.count",
  EUR  = "EUR..EUR.count",
  SAS  = "SAS..SAS.count",
  GGVP = "GGVP.ALL"
)

# Builds a data frame holding, for every population in `source_cols`, a text
# frequency column and a numeric count column (in that order).
#   df          data frame that contains the source columns
#   source_cols named character vector: new column name -> source column name
split_pop_columns <- function(df, source_cols) {
  out <- list()
  for (pop in names(source_cols)) {
    # exact = FALSE keeps the partial name matching that `df$name` performs,
    # which the previous `working_df$...` lookups may have relied on.
    cells <- df[[source_cols[[pop]], exact = FALSE]]
    if (is.null(cells)) {
      stop("column '", source_cols[[pop]], "' not found in the input data",
           call. = FALSE)
    }
    tokens <- strsplit(as.character(cells), " ")
    # 2 x n matrix: row 1 = frequency token, row 2 = count token.
    # rep_len() reuses a lone token (including NA) for both rows, which is
    # what data-frame recycling did in the earlier implementation; tokens
    # beyond the second are ignored.
    parts <- vapply(tokens, function(tok) rep_len(tok, 2L), character(2))
    out[[pop]] <- parts[1, ]
    out[[paste0(pop, "_count")]] <- as.numeric(gsub("[()]", "", parts[2, ]))
  }
  as.data.frame(out, stringsAsFactors = FALSE)
}

# The first 32 columns are carried over unchanged; the reformatted population
# columns are appended after them as columns 33 to 44.
test_working_df <- cbind(
  working_df[, 1:32],
  split_pop_columns(working_df, pop_source_cols)
)
row.names(test_working_df) <- seq_len(nrow(test_working_df))

# Drop the temporaries so only test_working_df is left behind by this step.
rm(pop_source_cols, split_pop_columns)

# Extracts the variant sites of each haplotype (row) and groups identical ones.
#
# For every row of test_working_df the non-missing values found in
# integer-typed columns among columns 1:25 are collected into a vector.
# Each distinct vector is stored once in `haplotypes` as
#   list(<site vector>, <row numbers that share it>)
# and is printed the first time it is seen.
# column 25 is the last column for sites within the N-domain for CEACAM1

haplotypes <- list()
hap_strings <- character(0)
hap_key <- NULL

for (z in seq_len(nrow(test_working_df))) {
  var_vect <- NULL
  for (i in test_working_df[z, 1:25]) {
    # `i` is the single cell of one column for row z.
    if (is.integer(i) && !is.na(i)) {
      var_vect <- c(var_vect, i)
    }
  }

  # The lookup key is comma-separated: gluing the numbers together with no
  # separator would make e.g. sites (10, 3) and (103) indistinguishable.
  hap_key <- paste(var_vect, collapse = ",")
  index <- match(hap_key, hap_strings)

  if (is.na(index)) {
    # First occurrence: register the haplotype together with its row number.
    haplotypes[[length(haplotypes) + 1]] <- c(list(var_vect), z)
    hap_strings <- c(hap_strings, hap_key)
    print(var_vect)
  } else {
    # Already known: just record that row z carries it as well.
    haplotypes[[index]][[2]] <- append(haplotypes[[index]][[2]], z)
  }
}

rm(hap_key)

# Changes the population frequency columns of test_working_df (columns 33, 35,
# ..., 43) from text/factor to numeric so they can be summed.
freq_cols <- seq(33, 43, 2)
test_working_df[freq_cols] <- lapply(
  test_working_df[freq_cols],
  function(column) as.numeric(as.character(column))
)

# Combines counts and frequency values for identical N-domain haplotypes.
# Note: there are multiple rows per N-domain haplotype, because originally the
# haplotypes included variants outside of the N-domain as well.
#
# hap_df gets one row per entry of `haplotypes`: `var_sites` is the site
# vector written as "a, b, c", followed by the column sums of columns 31:44
# of test_working_df over the rows that share that haplotype. The sums stay
# numeric throughout instead of being pasted into a character matrix and
# parsed back.
total_cols <- 31:44

hap_totals <- t(vapply(
  haplotypes,
  function(hap) colSums(test_working_df[unlist(hap[2]), total_cols]),
  numeric(length(total_cols))
))
colnames(hap_totals) <- colnames(test_working_df)[total_cols]

hap_df <- data.frame(
  var_sites = vapply(
    haplotypes,
    function(hap) paste(unlist(hap[1]), collapse = ", "),
    character(1)
  ),
  hap_totals,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Remove loop leftovers from the haplotype steps and this step's temporaries.
# Only names that actually exist are removed, so no warning is raised when a
# temporary was never created.
rm(list = intersect(
  c("hap_strings", "i", "j", "k", "index", "new_row", "total_freqs",
    "var_vect", "z", "freq_cols", "total_cols", "hap_totals"),
  ls()
))

# get reference sequence data

# Population labels, in the same order as the totals below.
pop_names <- c("AFR", "AMR", "EAS", "EUR", "SAS", "GGVP")

# One-row data frame with one column per population holding that population's
# total; the single row is named "total_pop_counts".
total_pop_counts <- as.data.frame(
  matrix(
    c(1322, 694, 1008, 1006, 978, 1010),
    nrow = 1,
    dimnames = list("total_pop_counts", pop_names)
  )
)


# Reference ("REF") row: whatever is left once every variant haplotype has
# been subtracted from the totals.

# Overall frequency and count. The overall total is derived from
# total_pop_counts (1322 + 694 + 1008 + 1006 + 978 + 1010 = 6018) rather than
# repeated as a literal, so the two cannot drift apart.
ref_freq <- 1 - sum(hap_df$Frequency)
ref_count <- sum(unlist(total_pop_counts)) - sum(hap_df$count)

# Per-population remainder. Result is a 2 x length(pop_names) matrix:
# row 1 = frequency, row 2 = count. The count column of each population is
# taken to be the column immediately to the right of its frequency column
# in hap_df.
ref_pop_values <- vapply(
  pop_names,
  function(pop) {
    count_col <- which(colnames(hap_df) == pop) + 1
    c(
      1 - sum(hap_df[[pop]]),
      total_pop_counts[[pop]] - sum(hap_df[[count_col]])
    )
  },
  numeric(2)
)

# as.vector() reads the matrix column by column, giving
# freq, count, freq, count, ... in pop_names order. Values stay numeric
# instead of being pasted into a character matrix and parsed back.
ref_data <- data.frame(
  "REF",
  t(c(ref_freq, ref_count, as.vector(ref_pop_values))),
  stringsAsFactors = FALSE
)
stopifnot(ncol(ref_data) == ncol(hap_df))
colnames(ref_data) <- colnames(hap_df)

# REF first, followed by every variant haplotype.
ref_and_haps <- rbind(ref_data, hap_df)

rm(ref_pop_values)

# Adds a "group" column to ref_and_haps that labels every row as
#   "REF"   - var_sites contains the entry "REF"
#   "major" - var_sites contains at least one of major_hap_sites
#   "minor" - anything else
# Rows are tested in that order, so "REF" wins over "major".

# Identify haplotypes with SNPs which change human CEACAM1 sites to match
# sites in CEACAM3 or CEACAM5.
major_hap_sites <- c(103, 183, 233, 248, 283, 333, 350, 369)

# var_sites is stored as "a, b, c"; split every row into its site strings.
# The sites are compared as text, hence as.character() on the numeric list.
ref_and_haps$group <- vapply(
  strsplit(as.character(ref_and_haps$var_sites), ", "),
  function(sites) {
    if ("REF" %in% sites) {
      "REF"
    } else if (any(sites %in% as.character(major_hap_sites))) {
      "major"
    } else {
      "minor"
    }
  },
  character(1)
)


# "Minor" alleles are haplotypes that do not contain any of the SNPs which
# change the reference CEACAM1 sequence to match CEACAM3 or CEACAM5.
# Combine the minor alleles into a single category.

minor_df <- ref_and_haps[which(ref_and_haps$group == "minor"), ]

# Column sums over everything between the first column (var_sites) and the
# last column (group). The sums are kept numeric rather than being pasted
# into a character matrix and converted back.
minor_totals <- colSums(minor_df[2:(ncol(minor_df) - 1)])

# One-row summary laid out exactly like ref_and_haps.
minor_sums <- data.frame(
  "Minor",
  t(minor_totals),
  "minor",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
colnames(minor_sums) <- colnames(ref_and_haps)

rm(minor_totals)

# Split the "major" haplotypes by which of the three high-frequency SNPs
# (103, 248, 369) they carry. The group column of major_df is overwritten with
#   "full_hap"                                   - all three SNPs
#   "only103" / "only248" / "only369"            - exactly one of them
#   "pair103_248" / "pair103_369" / "pair248_369" - exactly two of them
#   "otherCCM-like"                              - none of them
major_df <- ref_and_haps[which(ref_and_haps$group == "major"), ]

high_freq_sites <- c("103", "248", "369")

# Classifying through vapply() (instead of looping over seq(nrow(...))) also
# copes with major_df having no rows at all.
major_df$group <- vapply(
  strsplit(as.character(major_df$var_sites), ", "),
  function(sites) {
    # Subset keeps the 103, 248, 369 order, which the pair labels rely on.
    carried <- high_freq_sites[high_freq_sites %in% sites]
    switch(
      as.character(length(carried)),
      "0" = "otherCCM-like",
      "1" = paste0("only", carried),
      "2" = paste0("pair", paste(carried, collapse = "_")),
      "3" = "full_hap"
    )
  },
  character(1)
)

rm(high_freq_sites)

# Collapses a set of haplotype rows into a single summary row.
#   rows   data frame laid out like ref_and_haps (label column first, group
#          column last, numeric columns in between)
#   label  text for the first column of the summary row
#   group  text for the last column of the summary row
# Returns a one-row data frame with the same column names as `rows`; the
# numeric columns hold the column sums (all zero when `rows` is empty).
collapse_haplotypes <- function(rows, label, group) {
  totals <- colSums(rows[2:(ncol(rows) - 1)])
  out <- data.frame(
    label,
    t(totals),
    group,
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
  colnames(out) <- colnames(rows)
  out
}

# 1 - full hap
# total over all haplotypes that carry all three high-frequency SNPs
full_hap_sum <- collapse_haplotypes(
  major_df[major_df$group %in% "full_hap", ],
  "103,248,369", "major"
)

# Haplotypes with only 1 or 2 of the three variants (3 variants is the
# "full haplotype") and the remaining CCM-like haplotypes. All three
# summaries are labelled "minor", so they are pooled into non_major below.
# NOTE(review): confirm that pooling these with the true minor haplotypes is
# what the figure is meant to show.
# 2
only1var <- collapse_haplotypes(
  major_df[major_df$group %in% c("only103", "only248", "only369"), ],
  "only1var", "minor"
)

# 3
only2var <- collapse_haplotypes(
  major_df[major_df$group %in% c("pair103_248", "pair103_369", "pair248_369"), ],
  "only2var", "minor"
)

# 4
otherCCM_like <- collapse_haplotypes(
  major_df[major_df$group %in% "otherCCM-like", ],
  "otherCCM-like", "minor"
)

# Combines the above calculations into a single table.
compressed_df <- rbind(
  ref_and_haps[ref_and_haps$group %in% "REF", ],
  minor_sums,
  full_hap_sum,
  only1var,
  only2var,
  otherCCM_like
)

# Make sure every value column is numeric before summing again.
value_cols <- 2:(ncol(compressed_df) - 1)
compressed_df[value_cols] <- lapply(compressed_df[value_cols], as.numeric)

# Sums every row labelled "minor" (the pooled minor haplotypes plus the
# only1var, only2var and otherCCM-like summaries) into one "non_major" row.
non_major <- collapse_haplotypes(
  compressed_df[compressed_df$group %in% "minor", ],
  "non_major", "non_major"
)

# Final table: REF row(s), then the "major" row(s), then non_major.
compressed_df <- rbind(
  compressed_df[compressed_df$group %in% "REF", ],
  compressed_df[compressed_df$group %in% "major", ],
  non_major
)

rm(value_cols, collapse_haplotypes)

# Code to make the pie chart.
# To plot a single population instead of the overall data, change
# `y = Frequency` in the ggplot() call below to that population's column.
# Population codes: AFR = Africa, EAS = East Asia, EUR = Europe,
# SAS = South Asia, GGVP = Gambia (an AMR column is available as well).
# NOTE(review): the three fill colours are unnamed, so they are handed to the
# group values in the order ggplot2 sorts them - confirm the mapping is the
# intended one.
barplot <- ggplot(compressed_df, aes(x = "", y = Frequency, fill = group)) +
  geom_bar(
    width = 1,
    stat = "identity",
    color = "black",
    size = 2,
    show.legend = FALSE
  ) +
  scale_fill_manual(values = c("#429e64", "#8dc4a2", "gold"))

# A stacked bar drawn in polar coordinates becomes a pie chart.
pie_chart <- barplot + coord_polar("y", start = 0)

# Strip grid, axes and backgrounds so only the pie itself remains.
pie_plot <- pie_chart +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "transparent", colour = NA),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank()
  )

# Top-level expression: displays the plot when the script is run.
pie_plot

# Write the same themed plot to disk.
ggsave("pie.pdf", plot = pie_plot, width = 500, height = 500, units = "mm")